{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Excel vs Pandas\n",
    "### 制作：刘早起\n",
    "### 公众号：早起Python"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据读取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>创建时间</th>\n",
       "      <th>地址</th>\n",
       "      <th>岗位</th>\n",
       "      <th>学历</th>\n",
       "      <th>技能要求</th>\n",
       "      <th>工作经验</th>\n",
       "      <th>薪资水平</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2020-03-16 12:28:11</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>['Hive', '数据挖掘', '数据分析']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>20000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2020-03-16 12:39:03</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '数据库']</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>13000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2020-03-16 12:01:37</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '商业']</td>\n",
       "      <td>应届毕业生</td>\n",
       "      <td>8000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2020-03-16 11:19:51</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['SQL', '数据分析', '数据库']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>20000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2020-03-16 11:18:49</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>大专</td>\n",
       "      <td>['数据分析']</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>8000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>820</th>\n",
       "      <td>2020-03-16 11:20:44</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>1年以下</td>\n",
       "      <td>11500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>821</th>\n",
       "      <td>2020-03-16 11:20:42</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>应届毕业生</td>\n",
       "      <td>4500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>822</th>\n",
       "      <td>2020-03-16 10:33:46</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['BI', '数据分析', 'SQL', '数据库']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>16500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>823</th>\n",
       "      <td>2020-03-16 11:20:43</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>不限</td>\n",
       "      <td>11500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>824</th>\n",
       "      <td>2020-03-16 10:37:24</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>不限</td>\n",
       "      <td>3000.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>825 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                   创建时间  地址    岗位  学历                          技能要求   工作经验  \\\n",
       "0   2020-03-16 12:28:11  广州  数据开发  本科      ['Hive', '数据挖掘', '数据分析']   3-5年   \n",
       "1   2020-03-16 12:39:03  广州  数据分析  本科               ['数据分析', '数据库']   1-3年   \n",
       "2   2020-03-16 12:01:37  广州  数据分析  本科                ['数据分析', '商业']  应届毕业生   \n",
       "3   2020-03-16 11:19:51  广州  数据分析  本科        ['SQL', '数据分析', '数据库']   3-5年   \n",
       "4   2020-03-16 11:18:49  广州  数据分析  大专                      ['数据分析']   1-3年   \n",
       "..                  ...  ..   ...  ..                           ...    ...   \n",
       "820 2020-03-16 11:20:44  深圳  数据分析  本科                            []   1年以下   \n",
       "821 2020-03-16 11:20:42  深圳  数据开发  本科                            []  应届毕业生   \n",
       "822 2020-03-16 10:33:46  深圳  数据分析  本科  ['BI', '数据分析', 'SQL', '数据库']   3-5年   \n",
       "823 2020-03-16 11:20:43  深圳  数据开发  本科                            []     不限   \n",
       "824 2020-03-16 10:37:24  深圳  数据分析  本科                            []     不限   \n",
       "\n",
       "        薪资水平  \n",
       "0    20000.0  \n",
       "1    13000.0  \n",
       "2     8000.0  \n",
       "3    20000.0  \n",
       "4     8000.0  \n",
       "..       ...  \n",
       "820  11500.0  \n",
       "821   4500.0  \n",
       "822  16500.0  \n",
       "823  11500.0  \n",
       "824   3000.0  \n",
       "\n",
       "[825 rows x 7 columns]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#将示例数据.xlsx放在该Notebook同一文件夹下\n",
    "pd.read_excel(\"示例数据.xlsx\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据生成"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.079080</td>\n",
       "      <td>0.923725</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.428485</td>\n",
       "      <td>0.623286</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.848364</td>\n",
       "      <td>0.269628</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.605603</td>\n",
       "      <td>0.977730</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.133412</td>\n",
       "      <td>0.153470</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.265754</td>\n",
       "      <td>0.009527</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0.517848</td>\n",
       "      <td>0.667246</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0.658738</td>\n",
       "      <td>0.689726</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0.987348</td>\n",
       "      <td>0.523666</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0.435445</td>\n",
       "      <td>0.058743</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          0         1\n",
       "0  0.079080  0.923725\n",
       "1  0.428485  0.623286\n",
       "2  0.848364  0.269628\n",
       "3  0.605603  0.977730\n",
       "4  0.133412  0.153470\n",
       "5  0.265754  0.009527\n",
       "6  0.517848  0.667246\n",
       "7  0.658738  0.689726\n",
       "8  0.987348  0.523666\n",
       "9  0.435445  0.058743"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.DataFrame(np.random.rand(10,2))\n",
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据保存"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the generated frame to an Excel workbook (relative path; the index is written as the first column).\n",
    "data.to_excel(excel_writer=\"测试数据.xlsx\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据筛选"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>创建时间</th>\n",
       "      <th>地址</th>\n",
       "      <th>岗位</th>\n",
       "      <th>学历</th>\n",
       "      <th>技能要求</th>\n",
       "      <th>工作经验</th>\n",
       "      <th>薪资水平</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2020-03-16 12:28:11</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>['Hive', '数据挖掘', '数据分析']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>20000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2020-03-16 12:39:03</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '数据库']</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>13000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2020-03-16 12:01:37</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '商业']</td>\n",
       "      <td>应届毕业生</td>\n",
       "      <td>8000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2020-03-16 11:19:51</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['SQL', '数据分析', '数据库']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>20000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2020-03-16 11:18:49</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>大专</td>\n",
       "      <td>['数据分析']</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>8000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>817</th>\n",
       "      <td>2020-03-16 11:20:42</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>11500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>819</th>\n",
       "      <td>2020-03-16 11:20:44</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>15000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>820</th>\n",
       "      <td>2020-03-16 11:20:44</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>1年以下</td>\n",
       "      <td>11500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>822</th>\n",
       "      <td>2020-03-16 10:33:46</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['BI', '数据分析', 'SQL', '数据库']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>16500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>823</th>\n",
       "      <td>2020-03-16 11:20:43</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>不限</td>\n",
       "      <td>11500.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>776 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                   创建时间  地址    岗位  学历                          技能要求   工作经验  \\\n",
       "0   2020-03-16 12:28:11  广州  数据开发  本科      ['Hive', '数据挖掘', '数据分析']   3-5年   \n",
       "1   2020-03-16 12:39:03  广州  数据分析  本科               ['数据分析', '数据库']   1-3年   \n",
       "2   2020-03-16 12:01:37  广州  数据分析  本科                ['数据分析', '商业']  应届毕业生   \n",
       "3   2020-03-16 11:19:51  广州  数据分析  本科        ['SQL', '数据分析', '数据库']   3-5年   \n",
       "4   2020-03-16 11:18:49  广州  数据分析  大专                      ['数据分析']   1-3年   \n",
       "..                  ...  ..   ...  ..                           ...    ...   \n",
       "817 2020-03-16 11:20:42  深圳  数据分析  本科                            []   1-3年   \n",
       "819 2020-03-16 11:20:44  深圳  数据分析  本科                            []   1-3年   \n",
       "820 2020-03-16 11:20:44  深圳  数据分析  本科                            []   1年以下   \n",
       "822 2020-03-16 10:33:46  深圳  数据分析  本科  ['BI', '数据分析', 'SQL', '数据库']   3-5年   \n",
       "823 2020-03-16 11:20:43  深圳  数据开发  本科                            []     不限   \n",
       "\n",
       "        薪资水平  \n",
       "0    20000.0  \n",
       "1    13000.0  \n",
       "2     8000.0  \n",
       "3    20000.0  \n",
       "4     8000.0  \n",
       "..       ...  \n",
       "817  11500.0  \n",
       "819  15000.0  \n",
       "820  11500.0  \n",
       "822  16500.0  \n",
       "823  11500.0  \n",
       "\n",
       "[776 rows x 7 columns]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_excel(\"示例数据.xlsx\")\n",
    "df[df['薪资水平']>5000]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据插入"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>创建时间</th>\n",
       "      <th>地址</th>\n",
       "      <th>岗位</th>\n",
       "      <th>学历</th>\n",
       "      <th>技能要求</th>\n",
       "      <th>工作经验</th>\n",
       "      <th>薪资水平</th>\n",
       "      <th>new_col</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2020-03-16 12:28:11</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>['Hive', '数据挖掘', '数据分析']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>20000.0</td>\n",
       "      <td>高</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2020-03-16 12:39:03</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '数据库']</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>13000.0</td>\n",
       "      <td>高</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2020-03-16 12:01:37</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '商业']</td>\n",
       "      <td>应届毕业生</td>\n",
       "      <td>8000.0</td>\n",
       "      <td>低</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2020-03-16 11:19:51</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['SQL', '数据分析', '数据库']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>20000.0</td>\n",
       "      <td>高</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2020-03-16 11:18:49</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>大专</td>\n",
       "      <td>['数据分析']</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>8000.0</td>\n",
       "      <td>低</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>820</th>\n",
       "      <td>2020-03-16 11:20:44</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>1年以下</td>\n",
       "      <td>11500.0</td>\n",
       "      <td>高</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>821</th>\n",
       "      <td>2020-03-16 11:20:42</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>应届毕业生</td>\n",
       "      <td>4500.0</td>\n",
       "      <td>低</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>822</th>\n",
       "      <td>2020-03-16 10:33:46</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['BI', '数据分析', 'SQL', '数据库']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>16500.0</td>\n",
       "      <td>高</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>823</th>\n",
       "      <td>2020-03-16 11:20:43</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>不限</td>\n",
       "      <td>11500.0</td>\n",
       "      <td>高</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>824</th>\n",
       "      <td>2020-03-16 10:37:24</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>不限</td>\n",
       "      <td>3000.0</td>\n",
       "      <td>低</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>825 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                   创建时间  地址    岗位  学历                          技能要求   工作经验  \\\n",
       "0   2020-03-16 12:28:11  广州  数据开发  本科      ['Hive', '数据挖掘', '数据分析']   3-5年   \n",
       "1   2020-03-16 12:39:03  广州  数据分析  本科               ['数据分析', '数据库']   1-3年   \n",
       "2   2020-03-16 12:01:37  广州  数据分析  本科                ['数据分析', '商业']  应届毕业生   \n",
       "3   2020-03-16 11:19:51  广州  数据分析  本科        ['SQL', '数据分析', '数据库']   3-5年   \n",
       "4   2020-03-16 11:18:49  广州  数据分析  大专                      ['数据分析']   1-3年   \n",
       "..                  ...  ..   ...  ..                           ...    ...   \n",
       "820 2020-03-16 11:20:44  深圳  数据分析  本科                            []   1年以下   \n",
       "821 2020-03-16 11:20:42  深圳  数据开发  本科                            []  应届毕业生   \n",
       "822 2020-03-16 10:33:46  深圳  数据分析  本科  ['BI', '数据分析', 'SQL', '数据库']   3-5年   \n",
       "823 2020-03-16 11:20:43  深圳  数据开发  本科                            []     不限   \n",
       "824 2020-03-16 10:37:24  深圳  数据分析  本科                            []     不限   \n",
       "\n",
       "        薪资水平 new_col  \n",
       "0    20000.0       高  \n",
       "1    13000.0       高  \n",
       "2     8000.0       低  \n",
       "3    20000.0       高  \n",
       "4     8000.0       低  \n",
       "..       ...     ...  \n",
       "820  11500.0       高  \n",
       "821   4500.0       低  \n",
       "822  16500.0       高  \n",
       "823  11500.0       高  \n",
       "824   3000.0       低  \n",
       "\n",
       "[825 rows x 8 columns]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bins = [0,10000,max(df['薪资水平'])]\n",
    "group_names = ['低','高']\n",
    "df['new_col'] = pd.cut(df['薪资水平'], bins, labels=group_names)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据删除"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>创建时间</th>\n",
       "      <th>地址</th>\n",
       "      <th>岗位</th>\n",
       "      <th>学历</th>\n",
       "      <th>技能要求</th>\n",
       "      <th>工作经验</th>\n",
       "      <th>薪资水平</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2020-03-16 12:28:11</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>['Hive', '数据挖掘', '数据分析']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>20000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2020-03-16 12:39:03</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '数据库']</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>13000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2020-03-16 12:01:37</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '商业']</td>\n",
       "      <td>应届毕业生</td>\n",
       "      <td>8000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2020-03-16 11:19:51</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['SQL', '数据分析', '数据库']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>20000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2020-03-16 11:18:49</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>大专</td>\n",
       "      <td>['数据分析']</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>8000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>820</th>\n",
       "      <td>2020-03-16 11:20:44</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>1年以下</td>\n",
       "      <td>11500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>821</th>\n",
       "      <td>2020-03-16 11:20:42</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>应届毕业生</td>\n",
       "      <td>4500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>822</th>\n",
       "      <td>2020-03-16 10:33:46</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['BI', '数据分析', 'SQL', '数据库']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>16500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>823</th>\n",
       "      <td>2020-03-16 11:20:43</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>不限</td>\n",
       "      <td>11500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>824</th>\n",
       "      <td>2020-03-16 10:37:24</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>不限</td>\n",
       "      <td>3000.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>825 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                   创建时间  地址    岗位  学历                          技能要求   工作经验  \\\n",
       "0   2020-03-16 12:28:11  广州  数据开发  本科      ['Hive', '数据挖掘', '数据分析']   3-5年   \n",
       "1   2020-03-16 12:39:03  广州  数据分析  本科               ['数据分析', '数据库']   1-3年   \n",
       "2   2020-03-16 12:01:37  广州  数据分析  本科                ['数据分析', '商业']  应届毕业生   \n",
       "3   2020-03-16 11:19:51  广州  数据分析  本科        ['SQL', '数据分析', '数据库']   3-5年   \n",
       "4   2020-03-16 11:18:49  广州  数据分析  大专                      ['数据分析']   1-3年   \n",
       "..                  ...  ..   ...  ..                           ...    ...   \n",
       "820 2020-03-16 11:20:44  深圳  数据分析  本科                            []   1年以下   \n",
       "821 2020-03-16 11:20:42  深圳  数据开发  本科                            []  应届毕业生   \n",
       "822 2020-03-16 10:33:46  深圳  数据分析  本科  ['BI', '数据分析', 'SQL', '数据库']   3-5年   \n",
       "823 2020-03-16 11:20:43  深圳  数据开发  本科                            []     不限   \n",
       "824 2020-03-16 10:37:24  深圳  数据分析  本科                            []     不限   \n",
       "\n",
       "        薪资水平  \n",
       "0    20000.0  \n",
       "1    13000.0  \n",
       "2     8000.0  \n",
       "3    20000.0  \n",
       "4     8000.0  \n",
       "..       ...  \n",
       "820  11500.0  \n",
       "821   4500.0  \n",
       "822  16500.0  \n",
       "823  11500.0  \n",
       "824   3000.0  \n",
       "\n",
       "[825 rows x 7 columns]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "del df['new_col']\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据排序"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>创建时间</th>\n",
       "      <th>地址</th>\n",
       "      <th>岗位</th>\n",
       "      <th>学历</th>\n",
       "      <th>技能要求</th>\n",
       "      <th>工作经验</th>\n",
       "      <th>薪资水平</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2020-03-16 12:28:11</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>['Hive', '数据挖掘', '数据分析']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>20000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2020-03-16 12:39:03</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '数据库']</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>13000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2020-03-16 12:01:37</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '商业']</td>\n",
       "      <td>应届毕业生</td>\n",
       "      <td>8000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2020-03-16 11:19:51</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['SQL', '数据分析', '数据库']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>20000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2020-03-16 11:18:49</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>大专</td>\n",
       "      <td>['数据分析']</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>8000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>820</th>\n",
       "      <td>2020-03-16 11:20:44</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>1年以下</td>\n",
       "      <td>11500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>821</th>\n",
       "      <td>2020-03-16 11:20:42</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>应届毕业生</td>\n",
       "      <td>4500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>822</th>\n",
       "      <td>2020-03-16 10:33:46</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['BI', '数据分析', 'SQL', '数据库']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>16500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>823</th>\n",
       "      <td>2020-03-16 11:20:43</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>不限</td>\n",
       "      <td>11500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>824</th>\n",
       "      <td>2020-03-16 10:37:24</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>不限</td>\n",
       "      <td>3000.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>825 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                   创建时间  地址    岗位  学历                          技能要求   工作经验  \\\n",
       "0   2020-03-16 12:28:11  广州  数据开发  本科      ['Hive', '数据挖掘', '数据分析']   3-5年   \n",
       "1   2020-03-16 12:39:03  广州  数据分析  本科               ['数据分析', '数据库']   1-3年   \n",
       "2   2020-03-16 12:01:37  广州  数据分析  本科                ['数据分析', '商业']  应届毕业生   \n",
       "3   2020-03-16 11:19:51  广州  数据分析  本科        ['SQL', '数据分析', '数据库']   3-5年   \n",
       "4   2020-03-16 11:18:49  广州  数据分析  大专                      ['数据分析']   1-3年   \n",
       "..                  ...  ..   ...  ..                           ...    ...   \n",
       "820 2020-03-16 11:20:44  深圳  数据分析  本科                            []   1年以下   \n",
       "821 2020-03-16 11:20:42  深圳  数据开发  本科                            []  应届毕业生   \n",
       "822 2020-03-16 10:33:46  深圳  数据分析  本科  ['BI', '数据分析', 'SQL', '数据库']   3-5年   \n",
       "823 2020-03-16 11:20:43  深圳  数据开发  本科                            []     不限   \n",
       "824 2020-03-16 10:37:24  深圳  数据分析  本科                            []     不限   \n",
       "\n",
       "        薪资水平  \n",
       "0    20000.0  \n",
       "1    13000.0  \n",
       "2     8000.0  \n",
       "3    20000.0  \n",
       "4     8000.0  \n",
       "..       ...  \n",
       "820  11500.0  \n",
       "821   4500.0  \n",
       "822  16500.0  \n",
       "823  11500.0  \n",
       "824   3000.0  \n",
       "\n",
       "[825 rows x 7 columns]"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1 = df\n",
    "df1.sort_values(\"薪资水平\",ascending=False)\n",
    "df1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 缺失值处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "创建时间    0\n",
       "地址      0\n",
       "岗位      0\n",
       "学历      0\n",
       "技能要求    0\n",
       "工作经验    0\n",
       "薪资水平    2\n",
       "dtype: int64"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>创建时间</th>\n",
       "      <th>地址</th>\n",
       "      <th>岗位</th>\n",
       "      <th>学历</th>\n",
       "      <th>技能要求</th>\n",
       "      <th>工作经验</th>\n",
       "      <th>薪资水平</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2020-03-16 12:28:11</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>['Hive', '数据挖掘', '数据分析']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>20000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2020-03-16 12:39:03</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '数据库']</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>13000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2020-03-16 12:01:37</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '商业']</td>\n",
       "      <td>应届毕业生</td>\n",
       "      <td>8000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2020-03-16 11:19:51</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['SQL', '数据分析', '数据库']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>20000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2020-03-16 11:18:49</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>大专</td>\n",
       "      <td>['数据分析']</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>8000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>820</th>\n",
       "      <td>2020-03-16 11:20:44</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>1年以下</td>\n",
       "      <td>11500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>821</th>\n",
       "      <td>2020-03-16 11:20:42</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>应届毕业生</td>\n",
       "      <td>4500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>822</th>\n",
       "      <td>2020-03-16 10:33:46</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['BI', '数据分析', 'SQL', '数据库']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>16500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>823</th>\n",
       "      <td>2020-03-16 11:20:43</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>不限</td>\n",
       "      <td>11500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>824</th>\n",
       "      <td>2020-03-16 10:37:24</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>不限</td>\n",
       "      <td>3000.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>825 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                   创建时间  地址    岗位  学历                          技能要求   工作经验  \\\n",
       "0   2020-03-16 12:28:11  广州  数据开发  本科      ['Hive', '数据挖掘', '数据分析']   3-5年   \n",
       "1   2020-03-16 12:39:03  广州  数据分析  本科               ['数据分析', '数据库']   1-3年   \n",
       "2   2020-03-16 12:01:37  广州  数据分析  本科                ['数据分析', '商业']  应届毕业生   \n",
       "3   2020-03-16 11:19:51  广州  数据分析  本科        ['SQL', '数据分析', '数据库']   3-5年   \n",
       "4   2020-03-16 11:18:49  广州  数据分析  大专                      ['数据分析']   1-3年   \n",
       "..                  ...  ..   ...  ..                           ...    ...   \n",
       "820 2020-03-16 11:20:44  深圳  数据分析  本科                            []   1年以下   \n",
       "821 2020-03-16 11:20:42  深圳  数据开发  本科                            []  应届毕业生   \n",
       "822 2020-03-16 10:33:46  深圳  数据分析  本科  ['BI', '数据分析', 'SQL', '数据库']   3-5年   \n",
       "823 2020-03-16 11:20:43  深圳  数据开发  本科                            []     不限   \n",
       "824 2020-03-16 10:37:24  深圳  数据分析  本科                            []     不限   \n",
       "\n",
       "        薪资水平  \n",
       "0    20000.0  \n",
       "1    13000.0  \n",
       "2     8000.0  \n",
       "3    20000.0  \n",
       "4     8000.0  \n",
       "..       ...  \n",
       "820  11500.0  \n",
       "821   4500.0  \n",
       "822  16500.0  \n",
       "823  11500.0  \n",
       "824   3000.0  \n",
       "\n",
       "[825 rows x 7 columns]"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1 = df\n",
    "df1 = df1.fillna(axis=0,method='ffill')\n",
    "df1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据去重"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>创建时间</th>\n",
       "      <th>地址</th>\n",
       "      <th>岗位</th>\n",
       "      <th>学历</th>\n",
       "      <th>技能要求</th>\n",
       "      <th>工作经验</th>\n",
       "      <th>薪资水平</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2020-03-16 12:28:11</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>['Hive', '数据挖掘', '数据分析']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>20000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2020-03-16 12:39:03</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '数据库']</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>13000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2020-03-16 12:01:37</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '商业']</td>\n",
       "      <td>应届毕业生</td>\n",
       "      <td>8000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2020-03-16 11:19:51</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['SQL', '数据分析', '数据库']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>20000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2020-03-16 11:18:49</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>大专</td>\n",
       "      <td>['数据分析']</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>8000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>811</th>\n",
       "      <td>2020-03-16 11:03:17</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>['Hive', '数据分析', '数据仓库', '信息安全']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>37000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>812</th>\n",
       "      <td>2020-03-16 11:03:03</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据挖掘', '数据分析']</td>\n",
       "      <td>不限</td>\n",
       "      <td>30000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>815</th>\n",
       "      <td>2020-03-16 11:00:56</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['BI', '数据分析', 'SQL', '数据库']</td>\n",
       "      <td>5-10年</td>\n",
       "      <td>45000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>817</th>\n",
       "      <td>2020-03-16 11:20:42</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>11500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>824</th>\n",
       "      <td>2020-03-16 10:37:24</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>不限</td>\n",
       "      <td>3000.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>629 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                   创建时间  地址    岗位  学历                              技能要求  \\\n",
       "0   2020-03-16 12:28:11  广州  数据开发  本科          ['Hive', '数据挖掘', '数据分析']   \n",
       "1   2020-03-16 12:39:03  广州  数据分析  本科                   ['数据分析', '数据库']   \n",
       "2   2020-03-16 12:01:37  广州  数据分析  本科                    ['数据分析', '商业']   \n",
       "3   2020-03-16 11:19:51  广州  数据分析  本科            ['SQL', '数据分析', '数据库']   \n",
       "4   2020-03-16 11:18:49  广州  数据分析  大专                          ['数据分析']   \n",
       "..                  ...  ..   ...  ..                               ...   \n",
       "811 2020-03-16 11:03:17  深圳  数据开发  本科  ['Hive', '数据分析', '数据仓库', '信息安全']   \n",
       "812 2020-03-16 11:03:03  深圳  数据开发  本科                  ['数据挖掘', '数据分析']   \n",
       "815 2020-03-16 11:00:56  深圳  数据分析  本科      ['BI', '数据分析', 'SQL', '数据库']   \n",
       "817 2020-03-16 11:20:42  深圳  数据分析  本科                                []   \n",
       "824 2020-03-16 10:37:24  深圳  数据分析  本科                                []   \n",
       "\n",
       "      工作经验     薪资水平  \n",
       "0     3-5年  20000.0  \n",
       "1     1-3年  13000.0  \n",
       "2    应届毕业生   8000.0  \n",
       "3     3-5年  20000.0  \n",
       "4     1-3年   8000.0  \n",
       "..     ...      ...  \n",
       "811   3-5年  37000.0  \n",
       "812     不限  30000.0  \n",
       "815  5-10年  45000.0  \n",
       "817   1-3年  11500.0  \n",
       "824     不限   3000.0  \n",
       "\n",
       "[629 rows x 7 columns]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.drop_duplicates(['创建时间'],inplace=True)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 格式修改"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>创建时间</th>\n",
       "      <th>地址</th>\n",
       "      <th>岗位</th>\n",
       "      <th>学历</th>\n",
       "      <th>技能要求</th>\n",
       "      <th>工作经验</th>\n",
       "      <th>薪资水平</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>['Hive', '数据挖掘', '数据分析']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>20000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '数据库']</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>13000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '商业']</td>\n",
       "      <td>应届毕业生</td>\n",
       "      <td>8000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['SQL', '数据分析', '数据库']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>20000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>大专</td>\n",
       "      <td>['数据分析']</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>8000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>811</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>['Hive', '数据分析', '数据仓库', '信息安全']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>37000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>812</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据挖掘', '数据分析']</td>\n",
       "      <td>不限</td>\n",
       "      <td>30000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>815</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['BI', '数据分析', 'SQL', '数据库']</td>\n",
       "      <td>5-10年</td>\n",
       "      <td>45000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>817</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>11500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>824</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>不限</td>\n",
       "      <td>3000.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>629 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           创建时间  地址    岗位  学历                              技能要求   工作经验  \\\n",
       "0    2020-03-16  广州  数据开发  本科          ['Hive', '数据挖掘', '数据分析']   3-5年   \n",
       "1    2020-03-16  广州  数据分析  本科                   ['数据分析', '数据库']   1-3年   \n",
       "2    2020-03-16  广州  数据分析  本科                    ['数据分析', '商业']  应届毕业生   \n",
       "3    2020-03-16  广州  数据分析  本科            ['SQL', '数据分析', '数据库']   3-5年   \n",
       "4    2020-03-16  广州  数据分析  大专                          ['数据分析']   1-3年   \n",
       "..          ...  ..   ...  ..                               ...    ...   \n",
       "811  2020-03-16  深圳  数据开发  本科  ['Hive', '数据分析', '数据仓库', '信息安全']   3-5年   \n",
       "812  2020-03-16  深圳  数据开发  本科                  ['数据挖掘', '数据分析']     不限   \n",
       "815  2020-03-16  深圳  数据分析  本科      ['BI', '数据分析', 'SQL', '数据库']  5-10年   \n",
       "817  2020-03-16  深圳  数据分析  本科                                []   1-3年   \n",
       "824  2020-03-16  深圳  数据分析  本科                                []     不限   \n",
       "\n",
       "        薪资水平  \n",
       "0    20000.0  \n",
       "1    13000.0  \n",
       "2     8000.0  \n",
       "3    20000.0  \n",
       "4     8000.0  \n",
       "..       ...  \n",
       "811  37000.0  \n",
       "812  30000.0  \n",
       "815  45000.0  \n",
       "817  11500.0  \n",
       "824   3000.0  \n",
       "\n",
       "[629 rows x 7 columns]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1 = df\n",
    "df1['创建时间'] = df1['创建时间'].dt.strftime('%Y-%m-%d')\n",
    "df1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据交换"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>创建时间</th>\n",
       "      <th>岗位</th>\n",
       "      <th>地址</th>\n",
       "      <th>学历</th>\n",
       "      <th>技能要求</th>\n",
       "      <th>工作经验</th>\n",
       "      <th>薪资水平</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>广州</td>\n",
       "      <td>本科</td>\n",
       "      <td>['Hive', '数据挖掘', '数据分析']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>20000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>广州</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '数据库']</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>13000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>广州</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '商业']</td>\n",
       "      <td>应届毕业生</td>\n",
       "      <td>8000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>广州</td>\n",
       "      <td>本科</td>\n",
       "      <td>['SQL', '数据分析', '数据库']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>20000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>广州</td>\n",
       "      <td>大专</td>\n",
       "      <td>['数据分析']</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>8000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>811</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>深圳</td>\n",
       "      <td>本科</td>\n",
       "      <td>['Hive', '数据分析', '数据仓库', '信息安全']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>37000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>812</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>深圳</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据挖掘', '数据分析']</td>\n",
       "      <td>不限</td>\n",
       "      <td>30000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>815</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>深圳</td>\n",
       "      <td>本科</td>\n",
       "      <td>['BI', '数据分析', 'SQL', '数据库']</td>\n",
       "      <td>5-10年</td>\n",
       "      <td>45000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>817</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>深圳</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>11500.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>824</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>深圳</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>不限</td>\n",
       "      <td>3000.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>629 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           创建时间    岗位  地址  学历                              技能要求   工作经验  \\\n",
       "0    2020-03-16  数据开发  广州  本科          ['Hive', '数据挖掘', '数据分析']   3-5年   \n",
       "1    2020-03-16  数据分析  广州  本科                   ['数据分析', '数据库']   1-3年   \n",
       "2    2020-03-16  数据分析  广州  本科                    ['数据分析', '商业']  应届毕业生   \n",
       "3    2020-03-16  数据分析  广州  本科            ['SQL', '数据分析', '数据库']   3-5年   \n",
       "4    2020-03-16  数据分析  广州  大专                          ['数据分析']   1-3年   \n",
       "..          ...   ...  ..  ..                               ...    ...   \n",
       "811  2020-03-16  数据开发  深圳  本科  ['Hive', '数据分析', '数据仓库', '信息安全']   3-5年   \n",
       "812  2020-03-16  数据开发  深圳  本科                  ['数据挖掘', '数据分析']     不限   \n",
       "815  2020-03-16  数据分析  深圳  本科      ['BI', '数据分析', 'SQL', '数据库']  5-10年   \n",
       "817  2020-03-16  数据分析  深圳  本科                                []   1-3年   \n",
       "824  2020-03-16  数据分析  深圳  本科                                []     不限   \n",
       "\n",
       "        薪资水平  \n",
       "0    20000.0  \n",
       "1    13000.0  \n",
       "2     8000.0  \n",
       "3    20000.0  \n",
       "4     8000.0  \n",
       "..       ...  \n",
       "811  37000.0  \n",
       "812  30000.0  \n",
       "815  45000.0  \n",
       "817  11500.0  \n",
       "824   3000.0  \n",
       "\n",
       "[629 rows x 7 columns]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cols = df.columns[[0,2,1,3,4,5,6]]\n",
    "df1 = df[cols]\n",
    "df1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据合并"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>创建时间</th>\n",
       "      <th>地址</th>\n",
       "      <th>岗位</th>\n",
       "      <th>学历</th>\n",
       "      <th>技能要求</th>\n",
       "      <th>工作经验</th>\n",
       "      <th>薪资水平</th>\n",
       "      <th>合并列</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>['Hive', '数据挖掘', '数据分析']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>20000.0</td>\n",
       "      <td>广州数据开发</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '数据库']</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>13000.0</td>\n",
       "      <td>广州数据分析</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '商业']</td>\n",
       "      <td>应届毕业生</td>\n",
       "      <td>8000.0</td>\n",
       "      <td>广州数据分析</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['SQL', '数据分析', '数据库']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>20000.0</td>\n",
       "      <td>广州数据分析</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>大专</td>\n",
       "      <td>['数据分析']</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>8000.0</td>\n",
       "      <td>广州数据分析</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>811</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>['Hive', '数据分析', '数据仓库', '信息安全']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>37000.0</td>\n",
       "      <td>深圳数据开发</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>812</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据挖掘', '数据分析']</td>\n",
       "      <td>不限</td>\n",
       "      <td>30000.0</td>\n",
       "      <td>深圳数据开发</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>815</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['BI', '数据分析', 'SQL', '数据库']</td>\n",
       "      <td>5-10年</td>\n",
       "      <td>45000.0</td>\n",
       "      <td>深圳数据分析</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>817</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>11500.0</td>\n",
       "      <td>深圳数据分析</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>824</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>不限</td>\n",
       "      <td>3000.0</td>\n",
       "      <td>深圳数据分析</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>629 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           创建时间  地址    岗位  学历                              技能要求   工作经验  \\\n",
       "0    2020-03-16  广州  数据开发  本科          ['Hive', '数据挖掘', '数据分析']   3-5年   \n",
       "1    2020-03-16  广州  数据分析  本科                   ['数据分析', '数据库']   1-3年   \n",
       "2    2020-03-16  广州  数据分析  本科                    ['数据分析', '商业']  应届毕业生   \n",
       "3    2020-03-16  广州  数据分析  本科            ['SQL', '数据分析', '数据库']   3-5年   \n",
       "4    2020-03-16  广州  数据分析  大专                          ['数据分析']   1-3年   \n",
       "..          ...  ..   ...  ..                               ...    ...   \n",
       "811  2020-03-16  深圳  数据开发  本科  ['Hive', '数据分析', '数据仓库', '信息安全']   3-5年   \n",
       "812  2020-03-16  深圳  数据开发  本科                  ['数据挖掘', '数据分析']     不限   \n",
       "815  2020-03-16  深圳  数据分析  本科      ['BI', '数据分析', 'SQL', '数据库']  5-10年   \n",
       "817  2020-03-16  深圳  数据分析  本科                                []   1-3年   \n",
       "824  2020-03-16  深圳  数据分析  本科                                []     不限   \n",
       "\n",
       "        薪资水平     合并列  \n",
       "0    20000.0  广州数据开发  \n",
       "1    13000.0  广州数据分析  \n",
       "2     8000.0  广州数据分析  \n",
       "3    20000.0  广州数据分析  \n",
       "4     8000.0  广州数据分析  \n",
       "..       ...     ...  \n",
       "811  37000.0  深圳数据开发  \n",
       "812  30000.0  深圳数据开发  \n",
       "815  45000.0  深圳数据分析  \n",
       "817  11500.0  深圳数据分析  \n",
       "824   3000.0  深圳数据分析  \n",
       "\n",
       "[629 rows x 8 columns]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['合并列'] = df['地址'] + df['岗位']\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据拆分"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>['Hive'</td>\n",
       "      <td>'数据挖掘'</td>\n",
       "      <td>'数据分析']</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>['数据分析'</td>\n",
       "      <td>'数据库']</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>['数据分析'</td>\n",
       "      <td>'商业']</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>['SQL'</td>\n",
       "      <td>'数据分析'</td>\n",
       "      <td>'数据库']</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>['数据分析']</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>811</th>\n",
       "      <td>['Hive'</td>\n",
       "      <td>'数据分析'</td>\n",
       "      <td>'数据仓库'</td>\n",
       "      <td>'信息安全']</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>812</th>\n",
       "      <td>['数据挖掘'</td>\n",
       "      <td>'数据分析']</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>815</th>\n",
       "      <td>['BI'</td>\n",
       "      <td>'数据分析'</td>\n",
       "      <td>'SQL'</td>\n",
       "      <td>'数据库']</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>817</th>\n",
       "      <td>[]</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>824</th>\n",
       "      <td>[]</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>629 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            0         1         2         3\n",
       "0     ['Hive'    '数据挖掘'   '数据分析']      None\n",
       "1     ['数据分析'    '数据库']      None      None\n",
       "2     ['数据分析'     '商业']      None      None\n",
       "3      ['SQL'    '数据分析'    '数据库']      None\n",
       "4    ['数据分析']      None      None      None\n",
       "..        ...       ...       ...       ...\n",
       "811   ['Hive'    '数据分析'    '数据仓库'   '信息安全']\n",
       "812   ['数据挖掘'   '数据分析']      None      None\n",
       "815     ['BI'    '数据分析'     'SQL'    '数据库']\n",
       "817        []      None      None      None\n",
       "824        []      None      None      None\n",
       "\n",
       "[629 rows x 4 columns]"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['技能要求'].str.split(',',expand=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据分组"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>薪资水平</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>学历</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>不限</th>\n",
       "      <td>18647.058824</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>大专</th>\n",
       "      <td>11775.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>本科</th>\n",
       "      <td>19537.950664</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>硕士</th>\n",
       "      <td>20134.615385</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            薪资水平\n",
       "学历              \n",
       "不限  18647.058824\n",
       "大专  11775.000000\n",
       "本科  19537.950664\n",
       "硕士  20134.615385"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.groupby(\"学历\").mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据计算"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "518"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df[df[\"薪资水平\"]>10000])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据统计"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count      627.000000\n",
       "mean     19019.138756\n",
       "std       9696.823558\n",
       "min       1000.000000\n",
       "25%      12500.000000\n",
       "50%      17500.000000\n",
       "75%      25000.000000\n",
       "max      60000.000000\n",
       "Name: 薪资水平, dtype: float64"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[\"薪资水平\"].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据可视化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x11c084090>"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXMAAAD2CAYAAAAksGdNAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAARNUlEQVR4nO3dbYxcZ3mH8euOHBorW7YmTobIQAxNk6qwWKonCDW4mXWIIbFBTSskFCOwKF2ERMWLBU0qtWolKpyiVCQqolmhtjSlWqlFaknclMYhCwkkgqyQvQUJ4aJFcj6YWpFsNrggk7sfZly8yzg7O+fsvDxcP2nlM8+cl/ve2fP3mTNzZiIzkSSNt0uGXYAkqTrDXJIKYJhLUgEMc0kqgGEuSQXYNIyNbt26Nbdv377mfM899xyXX375xhc0ICX1U1IvYD+jrKReoFo/CwsLpzLzym73DSXMt2/fztNPP73mfPPz87RarY0vaEBK6qekXsB+RllJvUC1fiLi+xe7z9MsklQAw1ySCmCYS1IBDHNJKoBhLkkFMMwlqQCGuSQVwDCXpAIY5pJUgKFcAarxsf3Ow13HD06d48BF7qvD0qG9G7ZuqUQemUtSAQxzSSqAYS5JBTDMJakAvgC6Dhd7MbBX/b5o6IuBktbS05F5RFwaEQ+uGvtQRBzpTG+NiMcjYjEiDm1EoZKki1szzCNiM7AA3HLB2DXAgQtm+yBwGNgB3BoR19VbpiTphURm9jZjxPHMvLYz/a/ALPDhzHxjRHwN+MPMXIiI+4BvZeb9q5afAWYAGo3Gzrm5uTW3uby8zMTExLoa2kiLz5yutHxjM5w8u/7lprZNVtpuFRfrud9eejXonkftb62qkvopqReo1s/09PRCZja73bfuc+YRcQdwFPj2BcNXAOf3+jPAS1Yvl5mztP8DoNlsZi9fmzRqXxdV9SKZg1PnuGdx/S9TLO1vVdpuFRfrud9eejXonkftb62qkvopqRfYuH762Rv3Aa8A3gRcHxHvB04B5w+lJoGLfk+dJKl+6w7zzLwDICK2A5/JzL+OiKuAPRHxTeAm4N46i5QkvbC63md+H3AbcAw4nJnHa1qvJKkHPR+Zn3/x84LbS8AbO9OngF21ViZJ6plXgEpSAQxzSSqAYS5JBfCzWcZA1c+EkVQ+j8wlqQCGuSQVwDCXpAIY5pJUAMNckgpgmEtSAQxzSSqAYS5JBTDMJakAhrkkFcAwl6QCGOaSVADDXJIKYJhLUgEMc0kqgGEuSQXoKcwj4tKIeLAzHRHx2Yh4KiK+EBGbIuKyiHgoIo5GxAMRERtbtiTpQmuGeURsBhaAWzpDNwKbMvP1wIuBPcA7gBOZuQPYcsG8kqQBWDPMM/NsZr4WONEZOgnc25n+Seff3cAjnekvAdN1FilJemGRmb3NGHE8M6+94PbtwAeAm4F/Bz6RmUci4j3ADZn53lXLzwAzAI1GY+fc3Nya21xeXmZiYqLXXjbc4jOnKy3f2Awnz9ZUzJBtdC9T2yY3buVdjNrfWlUl9VNSL1Ctn+np6YXMbHa7r68vdI6It9IO8rdk5k8j4hRwfu+bBE6tXiYzZ4FZgGazma1Wa83tzM/P08t8g3Kg4hcrH5w6xz2LZXyH9kb3srS/tWHr7mbU/taqKqmfknqBjetn3e9miYiXAh8B9mbmDzvDj9I+dw7tUy6P1VOeJKkX/bw18V3A1cAXI+KJiHg38DlgW0QcA56lHe6SpAHp+Xny+fPlmXk3cHeXWfbVVZQkaX28aEiSCmCYS1IBDHNJKoBhLkkFMMwlqQCGuSQVoIzLEVWc7RWvtl2vg1Pn/v8K36VDewe6bakOHplLUgEMc0kqgGEuSQUwzCWpAIa5JBXAMJekAhjmklQAw1ySCmCYS1IBDHNJKoBhLkkFMMwlqQCGuSQVoKcwj4hLI+LBzvRlEfFQRByNiAei7efGNrZs
SdKF1gzziNgMLAC3dIbeAZzIzB3Als54tzFJ0oCsGeaZeTYzXwuc6AztBh7pTH8JmL7ImCRpQPr5coorgNOd6TPA9RcZWyEiZoAZgEajwfz8/JobWl5e7mm+QTk4da7S8o3N1dcxKkrqBVb2M0p/c/0atX2nipJ6gY3rp58wPwVMdqYnO7cnuoytkJmzwCxAs9nMVqu15obm5+fpZb5BOVDx228OTp3jnsUyvtyppF5gZT9L+1vDLaYGo7bvVFFSL7Bx/fTzbpZHgT2d6d3AYxcZkyQNSD9h/jlgW0QcA56lHeTdxiRJA9Lz8+TMvLbz74+Bfavu7jYmSRoQLxqSpAIY5pJUAMNckgpgmEtSAQxzSSqAYS5JBTDMJakAhrkkFcAwl6QCGOaSVADDXJIKYJhLUgEMc0kqgGEuSQUwzCWpAIa5JBXAMJekAhjmklQAw1ySCmCYS1IBDHNJKkBfYR4Rl0fEv0XEVyPiLyNia0Q8HhGLEXGo7iIlSS+s3yPz/cBTmXkj8GrgfuAwsAO4NSKuq6k+SVIPIjPXv1DEu4BrgT8FjgC/AezLzIWIuA/4Vmbev2qZGWAGoNFo7Jybm1tzO8vLy0xMTKy7vo2y+MzpSss3NsPJszUVM2Ql9QIr+5naNjncYmowavtOFSX1AtX6mZ6eXsjMZrf7NvVZzz8BTwJvAx4FXgacT7ozwEtWL5CZs8AsQLPZzFarteZG5ufn6WW+QTlw5+FKyx+cOsc9i/3+ykdLSb3Ayn6W9reGW0wNRm3fqaKkXmDj+un3NMtdwN9k5q/TDu7rgPOHM5PAqRpqkyT1qN8w/2XgfzvTP6Z9lL4nIi4BbgIeq6E2SVKP+g3zTwHvi4gngc3A7cBtwDHgcGYer6k+SVIP+jrpmZlLwI2rhndVrkaS1BcvGpKkAhjmklQAw1ySCmCYS1IBDHNJKoBhLkkFMMwlqQCGuSQVwDCXpAIY5pJUAMNckgpgmEtSAQxzSSqAYS5JBTDMJakAhrkkFcAwl6QCGOaSVADDXJIKYJhLUgH6DvOI+GhEPB4RD0fEVZ3pxYg4VGeBkqS19RXmEfEq4NWZuQt4GPgkcBjYAdwaEdfVV6IkaS39HpnfDGyJiK8Au4BXAo9k5vPAl4HpmuqTJPUgMnP9C0X8MfCrmfn7EfEk8Drg+sw8HhEfA57LzI+vWmYGmAFoNBo75+bm1tzO8vIyExMT665voyw+c7rS8o3NcPJsTcUMWUm9wMp+prZNDreYGozavlNFSb1AtX6mp6cXMrPZ7b5NfdZzBvhOZ/p7wFXA+T1gEvj+6gUycxaYBWg2m9lqtdbcyPz8PL3MNygH7jxcafmDU+e4Z7HfX/loKakXWNnP0v7WcIupwajtO1WU1AtsXD/9nmZZAG7oTF9LO9j3RMQlwE3AYzXUJknqUV9hnplPAqci4hu0g/ydwG3AMeBwZh6vr0RJ0lr6fp6cme9bNbSrYi2SpD550ZAkFcAwl6QCGOaSVADDXJIKYJhLUgEMc0kqgGEuSQUwzCWpAIa5JBXAMJekAhjmklQAw1ySCmCYS1IBDHNJKoBhLkkFGLvv/dpe8avbJKlEHplLUgEMc0kqgGEuSQUwzCWpAJXCPCI+FBFHImJrRDweEYsRcaiu4iRJvek7zCPiGuBA5+YHgcPADuDWiLiuemmSpF5VOTK/F7irM70beCQznwe+DExXLUyS1LvIzPUvFHEHcD3wd8BngJcDezPzeER8DHguMz++apkZYAag0WjsnJubW3M7y8vLTExMrBhbfOb0uusdFY3NcPLssKuoR0m9wMp+prZNDreYGnTbd8ZVSb1AtX6mp6cXMrPZ7b5+LxraB7wCeBPtUH8eOL8HTALfX71AZs4CswDNZjNbrdaaG5mfn2f1fAfG+KKhg1PnuGdx7K7T6qqkXmBlP0v7W8Mtpgbd9p1xVVIvsHH99LU3ZuYdABGxnfaR+deAPRHxTeAm2qdgJEkDUtdbE+8DbgOO
AYcz83hN65Uk9aDS8+TMXALe2Lm5q3I1kqS+eNGQJBXAMJekAhjmklQAw1ySCmCYS1IBDHNJKoBhLkkFMMwlqQCGuSQVwDCXpAIY5pJUAMNckgpgmEtSAQxzSSqAYS5JBTDMJakAhrkkFcAwl6QCGOaSVADDXJIKYJhLUgH6CvNo+2xEPBURX4iIiYh4KCKORsQDERF1FypJurh+j8xvBDZl5uuBFwPvBk5k5g5gC3BLTfVJknoQmbn+hSJ+DdiSmV+PiP8EmsAfZObnI+LDwJWZedeqZWaAGYBGo7Fzbm5uze0sLy8zMTGxYmzxmdPrrndUNDbDybPDrqIeJfUCK/uZ2jY53GJq0G3fGVcl9QLV+pmenl7IzGa3+zb1s8LM/C5ARNwOvAhYAM6n7Bng+i7LzAKzAM1mM1ut1prbmZ+fZ/V8B+483E/JI+Hg1DnuWezrVz5ySuoFVvaztL813GJq0G3fGVcl9QIb10/fL4BGxFuBDwBvAX4AnD+cmQROVS9NktSrfl8AfSnwEWBvZv4QeBTY07l7N/BYPeVJknrR75H5u4CrgS9GxBPApcC2iDgGPEs73CVJA9LvOfO7gbtXDd9fvRxJUj/KeQVLqsn2MX6R/byDU+d6frPA0qG9G1yNBsErQCWpAIa5JBXAMJekAhjmklQAw1ySCmCYS1IBDHNJKoBhLkkFMMwlqQCGuSQVwMv5pV9ww/z4Aj9KoD4emUtSAQxzSSqAYS5JBTDMJakAvgAq6RfOMF/0/fs3X74h6/XIXJIKYJhLUgEMc0kqQC3nzCPiMuBfgJcDx4B3ZmbWsW5J5erl3PV6vs/0F1ldR+bvAE5k5g5gC3BLTeuVJPWgrjDfDTzSmf4SMF3TeiVJPYg6zoZExBeBT2TmkYh4D3BDZr531TwzwEzn5vXAd3pY9VbgVOUCR0dJ/ZTUC9jPKCupF6jWzzWZeWW3O+p6n/kpYLIzPUmXQjNzFphdz0oj4unMbFYvbzSU1E9JvYD9jLKSeoGN66eu0yyPAns607uBx2parySpB3WF+eeAbRFxDHiWdrhLkgakltMsmfljYF8d61plXadlxkBJ/ZTUC9jPKCupF9igfmp5AVSSNFxeASpJBTDMJakAIxnmEXFZRDwUEUcj4oGIiGHXdDERcWlEPNiZ/rm6q4wNuI+IiM9GxFMR8YWImBjXXjr9bIqIf46Ir0bE347zY3NBTx+KiCMRsTUiHo+IxYg41Lmv77Eh9PHmiDgREU90fnYU8Nh8tPN7fTgirhrG4zOSYc6YfDxARGwGFvhZfd3qrjI2SDcCmzLz9cCLgXePcS8AvwMczcwbgauB91eofej9RMQ1wIHOzQ8Ch4EdwK0RcV3FsWH4dGa+ITPfANzAeD82rwJenZm7gIeBTzKEx2dUw3wsPh4gM89m5muBE52hbnVXGRukk8C9nemfAH/WpZ5x6QXgP4C/iohNwK8Av9mlpnHq517grs70buCRzHwe+DIX1Njn2DD8XkR8PSI+D9zMeD82NwNbIuIrwC7glQzh8RnVML8CON2ZPgO8ZIi1rEe3uquMDUxmfjczvx4RtwMvov2MYyx7AcjM5cz8EfBV2v9Rje1jExF3AEeBb3eGxraXjv8G/iQzX0f7WdPvdqlpnPq5EvifzPxt4GXA67rUtOH9jGqYr/nxACOqW91VxgYqIt4KfAB4C/CDHmsc1V6uiIhfAn6L9tPv1/RY5yj2s4/20d8csJP2Z3uMay/QvrDwSGd6CXi+S03j1M8ZfvZZU9+j3dPA+xnVMB/XjwfoVneVsYGJiJcCHwH2ZuYP11HjyPXScRB4W2b+FPgR8BddahqLfjLzjs655bfTfsb0KWBPRFwC3HRhjX2ODdqHgbd3angN7cdqLB+bjgXa5/0BrqUd7IN/fDJz5H6AXwIeov1FFw/QubhpVH+A4xeru8rYgHv4I+A48ETn573j2kunn220z6E+CfzjOD82F/S0nfYR7VbgceC/gI93
7ut7bAh9XA3MA98A/ryQx+bTnX7+YViPj1eASlIBRvU0iyRpHQxzSSqAYS5JBTDMJakAhrkkFcAwl6QC/B/sCuIiTXC7ywAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "%matplotlib inline\n",
    "df[\"薪资水平\"].hist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据抽样"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>创建时间</th>\n",
       "      <th>地址</th>\n",
       "      <th>岗位</th>\n",
       "      <th>学历</th>\n",
       "      <th>技能要求</th>\n",
       "      <th>工作经验</th>\n",
       "      <th>薪资水平</th>\n",
       "      <th>合并列</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>200</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>杭州</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>['ETL']</td>\n",
       "      <td>不限</td>\n",
       "      <td>20000.0</td>\n",
       "      <td>杭州数据开发</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>324</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>北京</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['商业', '数据分析']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>27500.0</td>\n",
       "      <td>北京数据分析</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>562</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>上海</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>硕士</td>\n",
       "      <td>['数据分析']</td>\n",
       "      <td>不限</td>\n",
       "      <td>12500.0</td>\n",
       "      <td>上海数据分析</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>667</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>深圳</td>\n",
       "      <td>运营</td>\n",
       "      <td>本科</td>\n",
       "      <td>['产品']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>22500.0</td>\n",
       "      <td>深圳运营</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>481</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>上海</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['SQL', '数据分析']</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>11500.0</td>\n",
       "      <td>上海数据分析</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>767</th>\n",
       "      <td>2020-03-13</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析']</td>\n",
       "      <td>不限</td>\n",
       "      <td>10000.0</td>\n",
       "      <td>深圳数据开发</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>580</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>30000.0</td>\n",
       "      <td>深圳数据分析</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>广州</td>\n",
       "      <td>市场|营销</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '行业分析', '市场分析']</td>\n",
       "      <td>应届毕业生</td>\n",
       "      <td>4000.0</td>\n",
       "      <td>广州市场|营销</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>619</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>22500.0</td>\n",
       "      <td>深圳数据分析</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>788</th>\n",
       "      <td>2020-03-11</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析']</td>\n",
       "      <td>不限</td>\n",
       "      <td>10000.0</td>\n",
       "      <td>深圳数据开发</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>213</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>杭州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>大专</td>\n",
       "      <td>['数据分析', 'SPSS', 'SQL', '数据库']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>7500.0</td>\n",
       "      <td>杭州数据分析</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>243</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>杭州</td>\n",
       "      <td>后端开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据挖掘', '机器学习']</td>\n",
       "      <td>应届毕业生</td>\n",
       "      <td>3500.0</td>\n",
       "      <td>杭州后端开发</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>大专</td>\n",
       "      <td>['数据分析']</td>\n",
       "      <td>1-3年</td>\n",
       "      <td>8000.0</td>\n",
       "      <td>广州数据分析</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>704</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>35000.0</td>\n",
       "      <td>深圳数据分析</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>97</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>广州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '数据库', '商业', '可视化']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>20000.0</td>\n",
       "      <td>广州数据分析</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>219</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>杭州</td>\n",
       "      <td>高端产品职位</td>\n",
       "      <td>本科</td>\n",
       "      <td>[]</td>\n",
       "      <td>5-10年</td>\n",
       "      <td>32500.0</td>\n",
       "      <td>杭州高端产品职位</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>768</th>\n",
       "      <td>2020-03-13</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>不限</td>\n",
       "      <td>['数据库', '数据分析']</td>\n",
       "      <td>不限</td>\n",
       "      <td>15000.0</td>\n",
       "      <td>深圳数据分析</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>711</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>深圳</td>\n",
       "      <td>数据开发</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析']</td>\n",
       "      <td>应届毕业生</td>\n",
       "      <td>3500.0</td>\n",
       "      <td>深圳数据开发</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>605</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>深圳</td>\n",
       "      <td>运营</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', '商务']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>13500.0</td>\n",
       "      <td>深圳运营</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>193</th>\n",
       "      <td>2020-03-16</td>\n",
       "      <td>杭州</td>\n",
       "      <td>数据分析</td>\n",
       "      <td>本科</td>\n",
       "      <td>['数据分析', 'SQL']</td>\n",
       "      <td>3-5年</td>\n",
       "      <td>15500.0</td>\n",
       "      <td>杭州数据分析</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           创建时间  地址      岗位  学历                            技能要求   工作经验  \\\n",
       "200  2020-03-16  杭州    数据开发  本科                         ['ETL']     不限   \n",
       "324  2020-03-16  北京    数据分析  本科                  ['商业', '数据分析']   3-5年   \n",
       "562  2020-03-16  上海    数据分析  硕士                        ['数据分析']     不限   \n",
       "667  2020-03-16  深圳      运营  本科                          ['产品']   3-5年   \n",
       "481  2020-03-16  上海    数据分析  本科                 ['SQL', '数据分析']   1-3年   \n",
       "767  2020-03-13  深圳    数据开发  本科                        ['数据分析']     不限   \n",
       "580  2020-03-16  深圳    数据分析  本科                        ['数据分析']   3-5年   \n",
       "104  2020-03-16  广州   市场|营销  本科        ['数据分析', '行业分析', '市场分析']  应届毕业生   \n",
       "619  2020-03-16  深圳    数据分析  本科                        ['数据分析']   3-5年   \n",
       "788  2020-03-11  深圳    数据开发  本科                        ['数据分析']     不限   \n",
       "213  2020-03-16  杭州    数据分析  大专  ['数据分析', 'SPSS', 'SQL', '数据库']   3-5年   \n",
       "243  2020-03-16  杭州    后端开发  本科                ['数据挖掘', '机器学习']  应届毕业生   \n",
       "4    2020-03-16  广州    数据分析  大专                        ['数据分析']   1-3年   \n",
       "704  2020-03-16  深圳    数据分析  本科                              []   3-5年   \n",
       "97   2020-03-16  广州    数据分析  本科    ['数据分析', '数据库', '商业', '可视化']   3-5年   \n",
       "219  2020-03-16  杭州  高端产品职位  本科                              []  5-10年   \n",
       "768  2020-03-13  深圳    数据分析  不限                 ['数据库', '数据分析']     不限   \n",
       "711  2020-03-16  深圳    数据开发  本科                        ['数据分析']  应届毕业生   \n",
       "605  2020-03-16  深圳      运营  本科                  ['数据分析', '商务']   3-5年   \n",
       "193  2020-03-16  杭州    数据分析  本科                 ['数据分析', 'SQL']   3-5年   \n",
       "\n",
       "        薪资水平       合并列  \n",
       "200  20000.0    杭州数据开发  \n",
       "324  27500.0    北京数据分析  \n",
       "562  12500.0    上海数据分析  \n",
       "667  22500.0      深圳运营  \n",
       "481  11500.0    上海数据分析  \n",
       "767  10000.0    深圳数据开发  \n",
       "580  30000.0    深圳数据分析  \n",
       "104   4000.0   广州市场|营销  \n",
       "619  22500.0    深圳数据分析  \n",
       "788  10000.0    深圳数据开发  \n",
       "213   7500.0    杭州数据分析  \n",
       "243   3500.0    杭州后端开发  \n",
       "4     8000.0    广州数据分析  \n",
       "704  35000.0    深圳数据分析  \n",
       "97   20000.0    广州数据分析  \n",
       "219  32500.0  杭州高端产品职位  \n",
       "768  15000.0    深圳数据分析  \n",
       "711   3500.0    深圳数据开发  \n",
       "605  13500.0      深圳运营  \n",
       "193  15500.0    杭州数据分析  "
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.sample(20)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据透视表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>薪资水平</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>地址</th>\n",
       "      <th>学历</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"4\" valign=\"top\">上海</th>\n",
       "      <th>不限</th>\n",
       "      <td>17000.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>大专</th>\n",
       "      <td>10000.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>本科</th>\n",
       "      <td>19464.285714</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>硕士</th>\n",
       "      <td>20642.857143</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"4\" valign=\"top\">北京</th>\n",
       "      <th>不限</th>\n",
       "      <td>22833.333333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>大专</th>\n",
       "      <td>19000.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>本科</th>\n",
       "      <td>23726.086957</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>硕士</th>\n",
       "      <td>27500.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"4\" valign=\"top\">广州</th>\n",
       "      <th>不限</th>\n",
       "      <td>17750.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>大专</th>\n",
       "      <td>10700.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>本科</th>\n",
       "      <td>14916.666667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>硕士</th>\n",
       "      <td>7666.666667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"4\" valign=\"top\">杭州</th>\n",
       "      <th>不限</th>\n",
       "      <td>17733.333333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>大专</th>\n",
       "      <td>16625.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>本科</th>\n",
       "      <td>20753.623188</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>硕士</th>\n",
       "      <td>14000.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"4\" valign=\"top\">深圳</th>\n",
       "      <th>不限</th>\n",
       "      <td>18833.333333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>大专</th>\n",
       "      <td>10437.500000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>本科</th>\n",
       "      <td>19313.793103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>硕士</th>\n",
       "      <td>23150.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               薪资水平\n",
       "地址 学历              \n",
       "上海 不限  17000.000000\n",
       "   大专  10000.000000\n",
       "   本科  19464.285714\n",
       "   硕士  20642.857143\n",
       "北京 不限  22833.333333\n",
       "   大专  19000.000000\n",
       "   本科  23726.086957\n",
       "   硕士  27500.000000\n",
       "广州 不限  17750.000000\n",
       "   大专  10700.000000\n",
       "   本科  14916.666667\n",
       "   硕士   7666.666667\n",
       "杭州 不限  17733.333333\n",
       "   大专  16625.000000\n",
       "   本科  20753.623188\n",
       "   硕士  14000.000000\n",
       "深圳 不限  18833.333333\n",
       "   大专  10437.500000\n",
       "   本科  19313.793103\n",
       "   硕士  23150.000000"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Average 薪资水平 (salary) for every 地址 (city) / 学历 (education) pair.\n",
    "# aggfunc=\"mean\" is the pandas default; it is spelled out here for clarity.\n",
    "df.pivot_table(\n",
    "    index=[\"地址\", \"学历\"],\n",
    "    values=[\"薪资水平\"],\n",
    "    aggfunc=\"mean\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## VLOOKUP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>序号</th>\n",
       "      <th>科目</th>\n",
       "      <th>成绩</th>\n",
       "      <th>排名</th>\n",
       "      <th>Unnamed: 4</th>\n",
       "      <th>序号.1</th>\n",
       "      <th>科目.1</th>\n",
       "      <th>成绩.1</th>\n",
       "      <th>排名.1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A</td>\n",
       "      <td>语文</td>\n",
       "      <td>80</td>\n",
       "      <td>8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>C</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>B</td>\n",
       "      <td>数学</td>\n",
       "      <td>70</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>C</td>\n",
       "      <td>英语</td>\n",
       "      <td>60</td>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>D</td>\n",
       "      <td>政治</td>\n",
       "      <td>50</td>\n",
       "      <td>5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>E</td>\n",
       "      <td>地理</td>\n",
       "      <td>90</td>\n",
       "      <td>6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>F</td>\n",
       "      <td>化学</td>\n",
       "      <td>100</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>G</td>\n",
       "      <td>生物</td>\n",
       "      <td>77</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  序号  科目   成绩  排名  Unnamed: 4 序号.1  科目.1  成绩.1  排名.1\n",
       "0  A  语文   80   8         NaN    C   NaN   NaN   NaN\n",
       "1  B  数学   70   2         NaN  NaN   NaN   NaN   NaN\n",
       "2  C  英语   60   4         NaN  NaN   NaN   NaN   NaN\n",
       "3  D  政治   50   5         NaN  NaN   NaN   NaN   NaN\n",
       "4  E  地理   90   6         NaN  NaN   NaN   NaN   NaN\n",
       "5  F  化学  100   1         NaN  NaN   NaN   NaN   NaN\n",
       "6  G  生物   77   3         NaN  NaN   NaN   NaN   NaN"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the first sheet of the demo workbook. As the displayed frame shows, the\n",
    "# left-hand columns hold the table to search and the .1-suffixed columns on the\n",
    "# right hold the key to look up.\n",
    "df1 = pd.read_excel(\"vlookup.xlsx\", sheet_name=0)\n",
    "df1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>序号</th>\n",
       "      <th>科目</th>\n",
       "      <th>成绩</th>\n",
       "      <th>排名</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A</td>\n",
       "      <td>语文</td>\n",
       "      <td>80</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>B</td>\n",
       "      <td>数学</td>\n",
       "      <td>70</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>C</td>\n",
       "      <td>英语</td>\n",
       "      <td>60</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>D</td>\n",
       "      <td>政治</td>\n",
       "      <td>50</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>E</td>\n",
       "      <td>地理</td>\n",
       "      <td>90</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>F</td>\n",
       "      <td>化学</td>\n",
       "      <td>100</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>G</td>\n",
       "      <td>生物</td>\n",
       "      <td>77</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  序号  科目   成绩  排名\n",
       "0  A  语文   80   8\n",
       "1  B  数学   70   2\n",
       "2  C  英语   60   4\n",
       "3  D  政治   50   5\n",
       "4  E  地理   90   6\n",
       "5  F  化学  100   1\n",
       "6  G  生物   77   3"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Lookup table: the four left-hand columns of the sheet, all rows.\n",
    "df2 = df1.loc[:, [\"序号\", \"科目\", \"成绩\", \"排名\"]]\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>序号</th>\n",
       "      <th>科目</th>\n",
       "      <th>成绩</th>\n",
       "      <th>排名</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>C</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  序号  科目  成绩  排名\n",
       "0  C NaN NaN NaN"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Query side: take the .1-suffixed columns, rename them to match the lookup\n",
    "# table's column names, and keep only row 0 (the one row that carries a key).\n",
    "df3 = (\n",
    "    df1[[\"序号.1\", \"科目.1\", \"成绩.1\", \"排名.1\"]]\n",
    "    .rename(columns={\"序号.1\": \"序号\", \"科目.1\": \"科目\", \"成绩.1\": \"成绩\", \"排名.1\": \"排名\"})\n",
    "    .loc[0:0]\n",
    ")\n",
    "df3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>科目</th>\n",
       "      <th>成绩</th>\n",
       "      <th>排名</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>序号</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A</th>\n",
       "      <td>语文</td>\n",
       "      <td>80</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>B</th>\n",
       "      <td>数学</td>\n",
       "      <td>70</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C</th>\n",
       "      <td>英语</td>\n",
       "      <td>60</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>D</th>\n",
       "      <td>政治</td>\n",
       "      <td>50</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>E</th>\n",
       "      <td>地理</td>\n",
       "      <td>90</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>F</th>\n",
       "      <td>化学</td>\n",
       "      <td>100</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>G</th>\n",
       "      <td>生物</td>\n",
       "      <td>77</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    科目   成绩  排名\n",
       "序号             \n",
       "A   语文   80   8\n",
       "B   数学   70   2\n",
       "C   英语   60   4\n",
       "D   政治   50   5\n",
       "E   地理   90   6\n",
       "F   化学  100   1\n",
       "G   生物   77   3"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Index the lookup table by its key column so rows can be matched by label.\n",
    "# The guard keeps this cell re-runnable: once 序号 has become the index, calling\n",
    "# set_index(\"序号\") a second time would raise KeyError.\n",
    "if \"序号\" in df2.columns:\n",
    "    df2 = df2.set_index(\"序号\")\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>科目</th>\n",
       "      <th>成绩</th>\n",
       "      <th>排名</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>序号</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>C</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    科目  成绩  排名\n",
       "序号            \n",
       "C  NaN NaN NaN"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Index the query frame by the same key column so it aligns with df2 by label.\n",
    "# The guard keeps this cell re-runnable: once 序号 has become the index, calling\n",
    "# set_index(\"序号\") a second time would raise KeyError.\n",
    "if \"序号\" in df3.columns:\n",
    "    df3 = df3.set_index(\"序号\")\n",
    "df3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>科目</th>\n",
       "      <th>成绩</th>\n",
       "      <th>排名</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>序号</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>C</th>\n",
       "      <td>英语</td>\n",
       "      <td>60.0</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    科目    成绩   排名\n",
       "序号               \n",
       "C   英语  60.0  4.0"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# VLOOKUP step: update() aligns the two frames on the 序号 index and overwrites\n",
    "# df3 in place with the non-NA values found in df2. join=\"left\" keeps only the\n",
    "# rows already in df3; both keyword values are the pandas defaults, written out.\n",
    "# 成绩 and 排名 display as floats because those columns held NaN before the update.\n",
    "df3.update(df2, join=\"left\", overwrite=True)\n",
    "df3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
